#!/usr/bin/env perl
# by lhtk : lhtk80t7@gmail.com
#
# filter blast results by perc_identity and qcov_hsp_perc
#
use strict;
use warnings;

use autodie qw/open close/;
use Getopt::Long;
use JSON qw/to_json/;

BEGIN {
    if (defined $ENV{MOJO_HOME}) {
        unshift @INC, "$ENV{MOJO_HOME}/lib";
    } else {
        unshift @INC, "./lib";
    }
}
use ShareSubs qw#get_conf load_done update_done json2href#;

# -------------------------------------------
# mod_perl
our $home_dir = $ENV{MOJO_HOME};
$home_dir = '.' unless (defined $home_dir and -d $home_dir);
# ===========================================
# some settings
my $conf_dir = join '/', $home_dir, 'conf/b2dsc';
our $conf = get_conf(join '/', $conf_dir, 'b2dsc.json');
$conf->{known_probes} = get_conf(join '/', $conf_dir, "known_probes.json");
my $data_dir = join '/', $home_dir, $conf->{data_dir};
# 直接把 json 数据写到 js 文件中, 2017.4.6
our $public_tmp_dir = join '/', $home_dir, $conf->{public_tmp_dir};

# 收集运行参数等信息
our $info; # information, from or to done.json
# ===========================================
# command line args
# 
my $job_id;
our $pIdent;
our $qCover;
# 要么所有一起，要么某一个
our $all; # flag，所有的一起？
our $id;  # 某一个

# 或许应当添加错误检查，web 调用的时候不输出？还是输出到 log 里？
GetOptions(
    'jid|j=s' => \$job_id,
    'perc_identity|p=f' => \$pIdent,
    'qcov_hsp_perc|q=f' => \$qCover,
    'all|a' => \$all,
    'id|i=s' => \$id,
) or die "Error in command line arguments!\n";

# 错误检查或许不合适
unless (defined $job_id and defined $pIdent and defined $qCover) {
    print STDERR "\nProblems with arguments! \n未设定 job id / pIdent / qCover 值。\n\n";
    exit 1;
}
unless (in_range($pIdent,0,100) and in_range($qCover,0,100)) {
    print STDERR "\n pIdent/ qCover 数值不合适。\n\n";
    exit 1;
}
unless ($all or defined $id) {
    print "需要至少设定 --all 或 --id some_id 中的一个。\n\n";
    exit 1;
}

our $workdir = "$data_dir/$job_id";
our $public_job_dir = "$public_tmp_dir/$job_id";
mkdir $public_job_dir unless -d $public_job_dir;
# -----------------------------------------
# 收集信息

$info = load_done($job_id, $public_tmp_dir);

# 不能比原初值小
$pIdent = $info->{perc_identity} if $pIdent <= $info->{perc_identity};
$qCover = $info->{qcov_hsp_perc} if $qCover <= $info->{qcov_hsp_perc};

@{ $info }{qw/pIdent qCover/} = ($pIdent, $qCover);
if ($all) {
    $info->{just_filtered} = 'All of them';
} else {
    $info->{just_filtered} = $id;
}
# -----------------------------------------
# 如果已经存在，直接结束 -- 放在在 Blastn.pm 里了
our $pIqC = $pIdent . '_' . $qCover;

filter_bls_tsv();

$info->{last_task} = $info->{status};
$info->{status} = 'free';

# 更新 done.json
update_done($info, $job_id, $public_tmp_dir);

# ===========================================
# subroutines
# ===========================================

# [0, 100]
sub in_range {
    my ($num, $from, $to) = @_;
    if ($num >= $from and $num <= $to) {
        return 1;
    } else {
        return 0;
    }
}

# 是不是考虑还不周全？对于交错
sub filter_bls_tsv {
    our ($info, $pIdent, $qCover, $conf);
    our ($pIqC, $all, $id);
    our $public_job_dir;
    our $public_tmp_dir;
    my $species = $info->{species};
    # 复制一份
    my %tsvs = %{ $info->{blastn} }; # tsv 此前已经排过序，这里不再排

    unless ($all) {
        %tsvs = ( $id => $tsvs{$id} );
    }

    # known probes 暂时注释掉
    if ((exists $info->{id_map}) and (exists $info->{id_map}{known_probes})) {
        my $kp = $info->{id_map}{known_probes}; # sid => kp_id
        my $jid_kp = $conf->{known_probes}{$species};
    #     my $done_kp = load_done($jid_kp, $data_dir);
        my $done_kp = load_done($jid_kp, $public_tmp_dir);
        for my $sid (keys %tsvs) {
            if (exists $kp->{$sid}) {
                my $kp_id = $kp->{$sid};
                if (exists $done_kp->{filter_bls}{$pIqC}{$kp_id}) {
                    $info->{filter_bls}{$pIqC}{$sid} = $done_kp->{filter_bls}{$pIqC}{$kp_id};
                    $info->{max_cnts}{$pIqC}{$sid} = $done_kp->{max_cnts}{$pIqC}{$kp_id};
                    delete $tsvs{$sid};
                }
            }
        }

        return unless %tsvs;
    }

    for my $sid (keys %tsvs) {
        my $max_cnt = 0;
        my $tsv_i = join '/', $home_dir, $tsvs{$sid};
        
        my $count_json = "$public_job_dir/$sid-$pIqC-count.json";
        # jsons => 一堆 json 的集合，不是纯 json
        my $hsps_jsons = "$public_job_dir/$sid-$pIqC-hsps.jsons";
        $info->{filter_bls}{$pIqC}{$sid}{count} = $count_json;
        $info->{filter_bls}{$pIqC}{$sid}{hsps} = $hsps_jsons;
        $info->{filter_bls}{$pIqC}{$sid}{count} =~ s#^\Q$home_dir\E/public##;
        $info->{filter_bls}{$pIqC}{$sid}{hsps} =~ s#^\Q$home_dir\E/public##;

        open my $fh_i, '<', $tsv_i;
        open my $fh_o_count, '>', $count_json;
        open my $fh_o_hsps, '>', $hsps_jsons;

        my %counts;

        my $h = <$fh_i>;
        chomp $h;
        my @headers = split /\t/, $h;

        my %pre; # 上一个 hsp 的信息
        my %hsps; # start, end, strand
        while (<$fh_i>) {
            next if /^\s*$/;
            chomp;
            my %tmp;
            @tmp{@headers} = split /\t/, $_;

            next unless ($tmp{pident} >= $pIdent and $tmp{qcovhsp} >= $qCover);

            if (%pre) {
                next if ($pre{sseqid} eq $tmp{sseqid}) and ($pre{sstrand} eq $tmp{sstrand}) and ($tmp{sstart} <= $pre{send}); # 交错则跳过
            }

            @pre{@headers} = @tmp{@headers};
            my $pos = int($tmp{sstart} / 1e6); # Mbp -- 注意这里是 0 - 999_999, 1_000_000 - 1_999_999
            my $pos100K = int( ($tmp{sstart} % 1e6) / 1e5 ); # 直接得到 100 Kbp 分布

            push @{ $hsps{ $tmp{sseqid} }{$pos}{ $tmp{sstrand} }{$pos100K} }, [$tmp{sstart} % 1e5, $tmp{send} % 1e5];

            my($hgrp,$genome) = ($tmp{sseqid} =~ /^(\d+)(\w+)$/);
            $counts{$genome}{$hgrp}{$pos}++;
            my $t_cnt = $counts{$genome}{$hgrp}{$pos}; # tmp
            $max_cnt = ($t_cnt > $max_cnt) ? $t_cnt : $max_cnt;
        }
        close $fh_i;
        $info->{max_cnts}{$pIqC}{$sid} = $max_cnt;
        print $fh_o_count to_json(\%counts, {utf8 => 1});
        close $fh_o_count;

        while (my($k,$v) = each %hsps) {
            print $fh_o_hsps ">$k\n";
            while (my($k1,$v1) = each %$v) {
                print $fh_o_hsps "$k1 " . to_json(\%$v1, {utf8 => 1}) . "\n";
            }
        }
        close $fh_o_hsps;
    }
}
